This notebook creates dataframes with monthly facility generation and fuel use data, merges them, and exports the results. The code assumes that you have already downloaded an ELEC.txt
file from EIA's bulk download website.
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import io, time, json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib
import re
import os
import numpy as np
from scripts import facility_line_to_df
from joblib import Parallel, delayed
In [2]:
path = os.path.join('Raw data', 'Electricity data', '2017-03-15 ELEC.txt')
with open(path, 'rb') as f:
raw_txt = f.readlines()
In [3]:
gen_rows = [row for row in raw_txt if 'ELEC.PLANT.GEN' in row and 'series_id' in row and 'ALL.M' in row and 'ALL-' not in row]
total_fuel_rows = [row for row in raw_txt if 'ELEC.PLANT.CONS_TOT_BTU' in row and 'series_id' in row and 'ALL.M' in row and 'ALL-' not in row]
eg_fuel_rows = [row for row in raw_txt if 'ELEC.PLANT.CONS_EG_BTU' in row and 'series_id' in row and 'ALL.M' in row and 'ALL-' not in row]
In [4]:
if __name__ == '__main__':
exception_list = []
facility_gen = pd.concat(Parallel(n_jobs=-1)(delayed(facility_line_to_df)(json.loads(row)) for row in gen_rows))
facility_gen.reset_index(drop=True, inplace=True)
facility_gen.rename_axis({'value':'generation (MWh)'}, axis=1, inplace=True)
In [5]:
facility_gen.loc[:,'lat'] = facility_gen.loc[:,'lat'].astype(float)
facility_gen.loc[:,'lon'] = facility_gen.loc[:,'lon'].astype(float)
facility_gen.loc[:, 'plant id'] = facility_gen.loc[:, 'plant id'].astype(int)
In [6]:
#drop
facility_gen.tail()
Out[6]:
In [7]:
#drop
facility_gen['fuel'].unique(), facility_gen['prime mover'].unique()
Out[7]:
In [8]:
#drop
path = os.path.join('Clean data', 'Facility generation.csv')
facility_gen.to_csv(path, index=False)
In [9]:
if __name__ == '__main__':
exception_list = []
facility_all_fuel = pd.concat(Parallel(n_jobs=-1)(delayed(facility_line_to_df)(json.loads(row)) for row in total_fuel_rows))
facility_all_fuel.reset_index(drop=True, inplace=True)
facility_all_fuel.rename_axis({'value':'total fuel (mmbtu)'}, axis=1, inplace=True)
In [10]:
facility_all_fuel.loc[:,'lat'] = facility_all_fuel.loc[:,'lat'].astype(float)
facility_all_fuel.loc[:,'lon'] = facility_all_fuel.loc[:,'lon'].astype(float)
facility_all_fuel.loc[:,'plant id'] = facility_all_fuel.loc[:,'plant id'].astype(int)
In [11]:
#drop
facility_all_fuel.head()
Out[11]:
In [12]:
#drop
path = os.path.join('Clean data', 'Facility total fuel consumption.csv')
facility_all_fuel.to_csv(path, index=False)
In [13]:
if __name__ == '__main__':
exception_list = []
facility_eg_fuel = pd.concat(Parallel(n_jobs=-1)(delayed(facility_line_to_df)(json.loads(row)) for row in eg_fuel_rows))
facility_eg_fuel.reset_index(drop=True, inplace=True)
facility_eg_fuel.rename_axis({'value':'elec fuel (mmbtu)'}, axis=1, inplace=True)
In [14]:
facility_eg_fuel.loc[:,'lat'] = facility_eg_fuel.loc[:,'lat'].astype(float)
facility_eg_fuel.loc[:,'lon'] = facility_eg_fuel.loc[:,'lon'].astype(float)
facility_eg_fuel.loc[:,'plant id'] = facility_eg_fuel.loc[:,'plant id'].astype(int)
In [15]:
#drop
facility_eg_fuel.tail()
Out[15]:
In [16]:
#drop
path = os.path.join('Clean data', 'Facility electric fuel consumption.csv')
facility_eg_fuel.to_csv(path, index=False)
In [3]:
#drop
path1 = os.path.join('Clean data', 'Facility generation.csv')
path2 = os.path.join('Clean data', 'Facility total fuel consumption.csv')
path3 = os.path.join('Clean data', 'Facility electric fuel consumption.csv')
In [4]:
#drop
facility_gen = pd.read_csv(path1)
In [13]:
#drop
facility_gen.head()
Out[13]:
In [5]:
#drop
facility_all_fuel = pd.read_csv(path2)
In [6]:
#drop
facility_eg_fuel = pd.read_csv(path3)
In [17]:
keep_cols = ['fuel', 'generation (MWh)', 'month', 'plant id', 'prime mover', 'year',
'geography', 'lat', 'lon', 'last_updated']
merge_cols = ['fuel', 'month', 'plant id', 'year']
gen_total_fuel = facility_all_fuel.merge(facility_gen.loc[:,keep_cols],
how='outer', on=merge_cols)
In [18]:
#drop
gen_total_fuel.loc[gen_total_fuel['geography_x'].isnull()].head()
Out[18]:
In [19]:
def fill_missing(df):
cols = [col[:-2] for col in df.columns if '_x' in col]
# Create new column from the _x version, fill missing values from the _y version
for col in cols:
df[col] = df.loc[:, col + '_x']
df.loc[df[col].isnull(), col] = df.loc[df[col].isnull(), col + '_y']
df.drop([col+'_x', col+'_y'], axis=1, inplace=True)
In [20]:
fill_missing(gen_total_fuel)
In [21]:
#drop
gen_total_fuel.loc[gen_total_fuel['geography'].isnull()]
Out[21]:
In [22]:
keep_cols = ['fuel', 'elec fuel (mmbtu)', 'month', 'plant id', 'prime mover', 'year',
'geography', 'lat', 'lon', 'last_updated']
all_facility_data = gen_total_fuel.merge(facility_eg_fuel.loc[:,keep_cols],
how='outer', on=merge_cols)
In [23]:
fill_missing(all_facility_data)
In [24]:
#drop
all_facility_data.loc[all_facility_data['geography'].isnull()]
Out[24]:
In [25]:
all_facility_data.drop(['units', 'series_id'], axis=1, inplace=True)
all_facility_data.head()
Out[25]:
In [26]:
#drop
all_facility_data.dtypes
Out[26]:
In [27]:
all_facility_data['datetime'] = pd.to_datetime(all_facility_data['year'].astype(str) +
'-' + all_facility_data['month'].astype(str),
format='%Y-%m')
all_facility_data['quarter'] = all_facility_data['datetime'].dt.quarter
In [28]:
#drop
all_facility_data.head()
Out[28]:
These are mostly EIA emission factors
In [29]:
path = os.path.join('Clean data', 'Final emission factors.csv')
ef = pd.read_csv(path, index_col=0)
In [30]:
fossil_factors = dict(zip(ef.index, ef['Fossil Factor']))
total_factors = dict(zip(ef.index, ef['Total Factor']))
fossil_factors, total_factors
Out[30]:
In [31]:
# Start with 0 emissions in all rows
# For fuels where we have an emission factor, replace the 0 with the calculated value
all_facility_data['all fuel fossil CO2 (kg)'] = 0
all_facility_data['elec fuel fossil CO2 (kg)'] = 0
all_facility_data['all fuel total CO2 (kg)'] = 0
all_facility_data['elec fuel total CO2 (kg)'] = 0
for fuel in total_factors.keys():
# All fuel CO2 emissions
all_facility_data.loc[all_facility_data['fuel']==fuel,'all fuel fossil CO2 (kg)'] = \
all_facility_data.loc[all_facility_data['fuel']==fuel,'total fuel (mmbtu)'] * fossil_factors[fuel]
all_facility_data.loc[all_facility_data['fuel']==fuel,'all fuel total CO2 (kg)'] = \
all_facility_data.loc[all_facility_data['fuel']==fuel,'total fuel (mmbtu)'] * total_factors[fuel]
# Electric fuel CO2 emissions
all_facility_data.loc[all_facility_data['fuel']==fuel,'elec fuel fossil CO2 (kg)'] = \
all_facility_data.loc[all_facility_data['fuel']==fuel,'elec fuel (mmbtu)'] * fossil_factors[fuel]
all_facility_data.loc[all_facility_data['fuel']==fuel,'elec fuel total CO2 (kg)'] = \
all_facility_data.loc[all_facility_data['fuel']==fuel,'elec fuel (mmbtu)'] * total_factors[fuel]
In [32]:
# Fossil CO2
all_facility_data.loc[~(all_facility_data['all fuel fossil CO2 (kg)']>=0),
'all fuel fossil CO2 (kg)'] = 0
all_facility_data.loc[~(all_facility_data['elec fuel fossil CO2 (kg)']>=0),
'elec fuel fossil CO2 (kg)'] = 0
# Total CO2
all_facility_data.loc[~(all_facility_data['all fuel total CO2 (kg)']>=0),
'all fuel total CO2 (kg)'] = 0
all_facility_data.loc[~(all_facility_data['elec fuel total CO2 (kg)']>=0),
'elec fuel total CO2 (kg)'] = 0
In [33]:
path = os.path.join('Clean data', 'Facility gen fuels and CO2.csv')
all_facility_data.to_csv(path, index=False)
In [ ]: